1 Load data

Load the cleaned data from data_preparation.rmd. Assumes koi_data.Rda contains the necessary columns.

data_file_path <- "data/Rdas/koi_data.Rda"
koi_data_raw <- readRDS(data_file_path)

2 Logistic regression models

2.1 Select columns

Define target and predictor columns. Review this list carefully.

target_variable_name <- "koi_pdisposition" # Or 'koi_disposition'
predictor_cols <- c(
  # Numeric Predictors (Examples)
  "koi_period", "koi_duration", "koi_depth", "koi_prad", "koi_teq",
  "koi_insol", "koi_model_snr", "koi_steff", "koi_slogg", "koi_srad",
  "koi_smass", "koi_impact", "koi_ror", "koi_srho", "koi_sma", "koi_incl",
  "koi_dor", "koi_ldm_coeff1", "koi_ldm_coeff2", "koi_smet"
)
selected_cols <- c(target_variable_name, predictor_cols)

# Check which selected columns actually exist in the loaded data
selected_cols_exist <- selected_cols[selected_cols %in% names(koi_data_raw)]
missing_cols <- setdiff(selected_cols, selected_cols_exist)
if (length(missing_cols) > 0) {
  print(paste("Warning: The following selected columns were not found:", paste(missing_cols, collapse = ", ")))
}
if (!target_variable_name %in% selected_cols_exist) {
  stop(paste("Target variable", target_variable_name, "not found in the data!"))
}

# Subset the data
koi_data <- koi_data_raw[, selected_cols_exist]
print(paste("Selected", ncol(koi_data), "columns for analysis."))
## [1] "Selected 21 columns for analysis."

2.2 Split data into training and test sets

Stratified split based on the target variable.

set.seed(42) # for reproducibility
train_indices <- createDataPartition(koi_data[[target_variable_name]], p = 0.8, list = FALSE)
train_data <- koi_data[train_indices, ]
test_data <- koi_data[-train_indices, ]
print(paste("Training set size:", nrow(train_data)))
## [1] "Training set size: 6399"
print(paste("Test set size:", nrow(test_data)))
## [1] "Test set size: 1599"

2.3 Handle Missing Values (NA) - AFTER Splitting

Apply simple median/mode imputation. Consider more advanced methods if NAs are numerous or potentially non-random.

# --- Impute Training Data ---
numeric_predictors_train <- names(train_data)[sapply(train_data, is.numeric)]
factor_predictors_train <- names(train_data)[sapply(train_data, function(x) is.character(x) || is.factor(x))]
factor_predictors_train <- setdiff(factor_predictors_train, target_variable_name)

# Impute numeric with median (calculated from training data)
train_medians <- list()
for (col in numeric_predictors_train) {
  if (any(is.na(train_data[[col]]))) {
    median_val <- median(train_data[[col]], na.rm = TRUE)
    if (is.na(median_val)) {
      median_val <- 0
      print(paste("Warning: Train median NA for", col))
    }
    train_data[[col]][is.na(train_data[[col]])] <- median_val
    train_medians[[col]] <- median_val # Store median for applying to test set
  }
}

# Impute character/factor with mode (calculated from training data)
train_modes <- list()
for (col in factor_predictors_train) {
  if (any(is.na(train_data[[col]]))) {
    mode_val <- calculate_mode(train_data[[col]], na.rm = TRUE)
    if (is.na(mode_val)) {
      mode_val <- "Missing"
      print(paste("Warning: Train mode NA for", col))
    }
    train_data[[col]][is.na(train_data[[col]])] <- mode_val
    train_modes[[col]] <- mode_val # Store mode for applying to test set
  }
}

# Remove rows with NA in target (TRAIN)
rows_before_na_target_train <- nrow(train_data)
train_data <- train_data %>% filter(!is.na(.data[[target_variable_name]]))
if (nrow(train_data) < rows_before_na_target_train) print("Removed rows with NA target in TRAIN")

# --- Impute Test Data (using values from training data) ---
for (col in names(train_medians)) {
  if (col %in% names(test_data) && any(is.na(test_data[[col]]))) {
    test_data[[col]][is.na(test_data[[col]])] <- train_medians[[col]]
    # print(paste("Imputed NAs in TEST:", col))
  }
}
# Impute character/factor with *training* mode
for (col in names(train_modes)) {
  if (col %in% names(test_data) && any(is.na(test_data[[col]]))) {
    test_data[[col]][is.na(test_data[[col]])] <- train_modes[[col]]
    # print(paste("Imputed NAs in TEST:", col))
  }
}
# Handle any remaining NAs in test set predictors if medians/modes couldn't be calculated or applied
na_check_test <- colSums(is.na(test_data %>% select(-all_of(target_variable_name))))
if (any(na_check_test > 0)) {
  print("Warning: NAs still present in test set predictors after imputation:")
  print(na_check_test[na_check_test > 0])
  # Consider removing these test rows or using a more robust imputation
  test_data <- test_data[complete.cases(test_data %>% select(-all_of(target_variable_name))), ]
  print("Removed test rows with remaining NAs in predictors.")
}

# Remove rows with NA in target (TEST)
rows_before_na_target_test <- nrow(test_data)
test_data <- test_data %>% filter(!is.na(.data[[target_variable_name]]))
if (nrow(test_data) < rows_before_na_target_test) print("Removed rows with NA target in TEST")

2.4 Convert Predictors and Target to Factors

Ensure categorical predictors and the target variable are factors with consistent levels.

factor_cols_to_convert <- c(
  "koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co", "koi_fpflag_ec" # Flags
)
factor_cols_exist_train <- factor_cols_to_convert[factor_cols_to_convert %in% names(train_data)]

# Convert predictor factors in training data
if (length(factor_cols_exist_train) > 0) {
  train_data <- train_data %>%
    mutate(across(all_of(factor_cols_exist_train), as.factor))
  print(paste("Converted predictor(s) to factor in train data:", paste(factor_cols_exist_train, collapse = ", ")))
}

# Convert predictor factors in test data (using levels from training data if possible)
factor_cols_exist_test <- factor_cols_to_convert[factor_cols_to_convert %in% names(test_data)]
if (length(factor_cols_exist_test) > 0) {
  for (col in factor_cols_exist_test) {
    # Ensure levels in test match train, adding if necessary
    train_levels <- levels(train_data[[col]])
    test_data[[col]] <- factor(test_data[[col]], levels = train_levels)
  }
  print(paste("Converted predictor(s) to factor in test data:", paste(factor_cols_exist_test, collapse = ", ")))
}

# Convert target variable to factor with desired levels and labels
train_data[[target_variable_name]] <- factor(train_data[[target_variable_name]],
  levels = c("CANDIDATE", "FALSE POSITIVE"),
  labels = c("candidate", "false_positive")
)
test_data[[target_variable_name]] <- factor(test_data[[target_variable_name]],
  levels = c("CANDIDATE", "FALSE POSITIVE"),
  labels = c("candidate", "false_positive")
)

# Define positive class based on the new factor levels
positive_class <- levels(train_data[[target_variable_name]])[2]
if (is.na(positive_class)) stop("Could not determine positive class level after factoring.")
print(paste("Positive class for evaluation:", positive_class))
## [1] "Positive class for evaluation: false_positive"
# Check levels of predictor factors
for (var in factor_cols_exist_train) {
  print(paste("Levels in", var, "(Train):"))
  print(table(train_data[[var]]))
}

2.5 Scale Numerical Predictors

Scale numeric predictors AFTER splitting and handling NAs/factors. Fit scaler only on training data.

numeric_predictors_final <- names(train_data)[sapply(train_data, is.numeric)]

# Check for zero variance columns before scaling
nzv <- nearZeroVar(train_data[, numeric_predictors_final], saveMetrics = TRUE)
cols_to_scale <- rownames(nzv)[!nzv$zeroVar]
if (length(cols_to_scale) < length(numeric_predictors_final)) {
  print("Warning: Found zero-variance numeric columns, excluding from scaling:")
  print(rownames(nzv)[nzv$zeroVar])
}

if (length(cols_to_scale) > 0) {
  scaler <- preProcess(train_data[, cols_to_scale], method = c("center", "scale"))
  train_data_scaled <- predict(scaler, train_data)
  test_data_scaled <- predict(scaler, test_data)
  print(paste("Scaled numeric predictors:", paste(cols_to_scale, collapse = ", ")))
} else {
  print("No non-zero variance numeric predictors found to scale.")
  train_data_scaled <- train_data
  test_data_scaled <- test_data
}
## [1] "Scaled numeric predictors: koi_period, koi_duration, koi_depth, koi_prad, koi_teq, koi_insol, koi_model_snr, koi_steff, koi_slogg, koi_srad, koi_smass, koi_impact, koi_ror, koi_srho, koi_sma, koi_incl, koi_dor, koi_ldm_coeff1, koi_ldm_coeff2, koi_smet"

2.6 Model Fitting

Define the formula and fit the GLM.

glm_original_formula <- as.formula(paste(target_variable_name, "~ ."))
print(paste("Using formula:", deparse(glm_original_formula)))
## [1] "Using formula: koi_pdisposition ~ ."
glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)

summary(glm_original_model)
## 
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.09986    0.09556  11.510  < 2e-16 ***
## koi_period      0.25772    0.21190   1.216  0.22390    
## koi_duration    1.15093    0.09986  11.526  < 2e-16 ***
## koi_depth       2.19920    0.40141   5.479 4.28e-08 ***
## koi_prad        0.25948    0.38662   0.671  0.50212    
## koi_teq         1.82665    0.11695  15.619  < 2e-16 ***
## koi_insol      -0.44363    0.05698  -7.786 6.90e-15 ***
## koi_model_snr  -0.04758    0.08904  -0.534  0.59313    
## koi_steff      -0.48874    0.15495  -3.154  0.00161 ** 
## koi_slogg       0.67690    0.08798   7.694 1.43e-14 ***
## koi_srad        0.01393    0.07724   0.180  0.85685    
## koi_smass       0.44922    0.09916   4.530 5.89e-06 ***
## koi_impact      0.53968    0.10365   5.207 1.92e-07 ***
## koi_ror         2.55320    0.49385   5.170 2.34e-07 ***
## koi_srho        0.02435    0.03015   0.808  0.41923    
## koi_sma        -0.29034    0.24926  -1.165  0.24411    
## koi_incl       -0.41634    0.09885  -4.212 2.53e-05 ***
## koi_dor         0.58369    0.08070   7.233 4.74e-13 ***
## koi_ldm_coeff1 -1.05599    0.46039  -2.294  0.02181 *  
## koi_ldm_coeff2 -0.98381    0.36463  -2.698  0.00697 ** 
## koi_smet       -0.56487    0.05652  -9.995  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8870.4  on 6398  degrees of freedom
## Residual deviance: 5199.5  on 6378  degrees of freedom
## AIC: 5241.5
## 
## Number of Fisher Scoring iterations: 9

2.7 Model Performance

Predict on the test set and evaluate.

original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Ensure probabilities are valid
original_probs <- pmax(pmin(original_probs, 1 - 1e-15), 1e-15)

# Convert probabilities to class predictions using a 0.5 threshold
threshold <- 0.5
# Use the levels from the *test* data target factor for creating prediction factor
test_target_levels <- levels(test_data_scaled[[target_variable_name]])
negative_class <- test_target_levels[1] # Should be 'candidate'
# Note: positive_class was defined earlier based on train_data levels

original_preds <- factor(ifelse(original_probs > threshold, positive_class, negative_class),
  levels = test_target_levels
)

test_target_factor <- test_data_scaled[[target_variable_name]]

cm <- confusionMatrix(original_preds,
  test_target_factor,
  positive = positive_class
) # Ensure positive class is correctly specified
cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            708            198
##   false_positive        98            595
##                                          
##                Accuracy : 0.8149         
##                  95% CI : (0.795, 0.8336)
##     No Information Rate : 0.5041         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6294         
##                                          
##  Mcnemar's Test P-Value : 8.702e-09      
##                                          
##             Sensitivity : 0.7503         
##             Specificity : 0.8784         
##          Pos Pred Value : 0.8586         
##          Neg Pred Value : 0.7815         
##              Prevalence : 0.4959         
##          Detection Rate : 0.3721         
##    Detection Prevalence : 0.4334         
##       Balanced Accuracy : 0.8144         
##                                          
##        'Positive' Class : false_positive 
## 

Plot confusion matrix

cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_original <- roc(
  response = test_target_factor,
  predictor = original_probs,
  levels = test_target_levels
) # Specify levels explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))
## [1] "GLM Original Features - AUC: 0.8882"

Plot ROC curve

plot(roc_original,
  main = "ROC Curve - GLM Original Features",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

2.8 Outlier detection

Check for influential points using cook’s distance.

influenceIndexPlot(glm_original_model, vars = "C")

Find and remove all influential points with Cook’s distance > 0.5

cooks_d <- cooks.distance(glm_original_model)
outlier_indices <- which(cooks_d > 0.5)
print(paste("Number of influential points to remove:", length(outlier_indices)))
## [1] "Number of influential points to remove: 2"
# Remove all influential points
if (length(outlier_indices) > 0) {
  train_data_scaled <- train_data_scaled[-outlier_indices, ]
  print(paste("Removed", length(outlier_indices), "influential points"))
}
## [1] "Removed 2 influential points"

Refit the GLM with the updated training data.

glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_original_model)
## 
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.14156    0.08507  13.419  < 2e-16 ***
## koi_period      0.29864    0.21669   1.378  0.16814    
## koi_duration    1.29810    0.10383  12.502  < 2e-16 ***
## koi_depth       0.77960    0.34790   2.241  0.02503 *  
## koi_prad        0.01257    0.24641   0.051  0.95933    
## koi_teq         1.99661    0.12031  16.595  < 2e-16 ***
## koi_insol      -0.83265    0.09432  -8.828  < 2e-16 ***
## koi_model_snr  -0.08312    0.08217  -1.012  0.31171    
## koi_steff      -0.51090    0.15898  -3.214  0.00131 ** 
## koi_slogg       0.75842    0.08950   8.474  < 2e-16 ***
## koi_srad        0.08568    0.07305   1.173  0.24087    
## koi_smass       0.51646    0.10212   5.057 4.25e-07 ***
## koi_impact      0.40769    0.10601   3.846  0.00012 ***
## koi_ror         5.73441    0.50313  11.398  < 2e-16 ***
## koi_srho        0.02551    0.03030   0.842  0.39986    
## koi_sma        -0.40431    0.25554  -1.582  0.11361    
## koi_incl       -0.25444    0.10093  -2.521  0.01170 *  
## koi_dor         0.65097    0.08326   7.819 5.35e-15 ***
## koi_ldm_coeff1 -1.01362    0.47104  -2.152  0.03141 *  
## koi_ldm_coeff2 -0.95850    0.37285  -2.571  0.01015 *  
## koi_smet       -0.57885    0.05791  -9.996  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8867.7  on 6396  degrees of freedom
## Residual deviance: 5051.9  on 6376  degrees of freedom
## AIC: 5093.9
## 
## Number of Fisher Scoring iterations: 8

Check outliers again.

influenceIndexPlot(glm_original_model, vars = "C")

Find and remove all influential points with Cook’s distance > 0.5

cooks_d <- cooks.distance(glm_original_model)
outlier_indices <- which(cooks_d > 0.5)
print(paste("Number of influential points to remove:", length(outlier_indices)))
## [1] "Number of influential points to remove: 1"
# Remove all influential points
if (length(outlier_indices) > 0) {
  train_data_scaled <- train_data_scaled[-outlier_indices, ]
  print(paste("Removed", length(outlier_indices), "influential points"))
}
## [1] "Removed 1 influential points"

Refit the GLM with the updated training data.

glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_original_model)
## 
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.13220    0.08512  13.301  < 2e-16 ***
## koi_period      0.27949    0.21694   1.288  0.19763    
## koi_duration    1.30688    0.10405  12.561  < 2e-16 ***
## koi_depth       0.77501    0.34777   2.229  0.02585 *  
## koi_prad        0.01662    0.26552   0.063  0.95008    
## koi_teq         2.02544    0.12105  16.732  < 2e-16 ***
## koi_insol      -1.15821    0.12615  -9.181  < 2e-16 ***
## koi_model_snr  -0.08300    0.08220  -1.010  0.31262    
## koi_steff      -0.51294    0.15913  -3.223  0.00127 ** 
## koi_slogg       0.76720    0.08972   8.551  < 2e-16 ***
## koi_srad        0.09537    0.07306   1.305  0.19175    
## koi_smass       0.51567    0.10259   5.026 5.00e-07 ***
## koi_impact      0.41457    0.10607   3.908 9.29e-05 ***
## koi_ror         5.74178    0.51005  11.257  < 2e-16 ***
## koi_srho        0.02530    0.03031   0.835  0.40399    
## koi_sma        -0.38305    0.25592  -1.497  0.13446    
## koi_incl       -0.24436    0.10075  -2.426  0.01529 *  
## koi_dor         0.65519    0.08338   7.858 3.91e-15 ***
## koi_ldm_coeff1 -1.00526    0.47154  -2.132  0.03302 *  
## koi_ldm_coeff2 -0.95198    0.37324  -2.551  0.01075 *  
## koi_smet       -0.58088    0.05801 -10.013  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8866.2  on 6395  degrees of freedom
## Residual deviance: 5045.5  on 6375  degrees of freedom
## AIC: 5087.5
## 
## Number of Fisher Scoring iterations: 8

Check outliers again.

influenceIndexPlot(glm_original_model, vars = "C")

Now the model should be good.

2.9 Model Evaluation without outliers

Predict on the test set and evaluate.

original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Ensure probabilities are valid
original_probs <- pmax(pmin(original_probs, 1 - 1e-15), 1e-15)

# Convert probabilities to class predictions using a 0.5 threshold
threshold <- 0.5
# Use the levels from the *test* data target factor for creating prediction factor
test_target_levels <- levels(test_data_scaled[[target_variable_name]])
negative_class <- test_target_levels[1] # Should be 'candidate'
# Note: positive_class was defined earlier based on train_data levels

original_preds <- factor(ifelse(original_probs > threshold, positive_class, negative_class),
  levels = test_target_levels
)

test_target_factor <- test_data_scaled[[target_variable_name]]

cm <- confusionMatrix(original_preds,
  test_target_factor,
  positive = positive_class
) # Ensure positive class is correctly specified
cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            708            186
##   false_positive        98            607
##                                           
##                Accuracy : 0.8224          
##                  95% CI : (0.8028, 0.8408)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6444          
##                                           
##  Mcnemar's Test P-Value : 2.437e-07       
##                                           
##             Sensitivity : 0.7654          
##             Specificity : 0.8784          
##          Pos Pred Value : 0.8610          
##          Neg Pred Value : 0.7919          
##              Prevalence : 0.4959          
##          Detection Rate : 0.3796          
##    Detection Prevalence : 0.4409          
##       Balanced Accuracy : 0.8219          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_original <- roc(
  response = test_target_factor,
  predictor = original_probs,
  levels = test_target_levels
) # Specify levels explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))
## [1] "GLM Original Features - AUC: 0.8874"

Plot ROC curve

plot(roc_original,
  main = "ROC Curve - GLM Original Features",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

2.10 Residual analysis

  • Plot 1: Residuals vs Fitted (Linear Predictor) Look for a lack of strong patterns, random scatter around the smoothed line (though some banding is expected for binary data). Curvature might indicate link function issues or missing non-linear terms.
  • Plot 3: Residuals vs Leverage Look for points with high leverage (far right) AND large standardized residuals (far top/bottom). Points outside Cook’s distance contours (dashed lines) are influential.
plot(glm_original_model, which = c(1, 5))

Calculates Variance Inflation Factors. Important when using original predictors. High VIF (> 5 or > 10) suggests multicollinearity might be inflating standard errors of coefficients, making them unstable/unreliable.

vif_values <- vif(glm_original_model)
print("VIF Values:")
## [1] "VIF Values:"
print(vif_values)
##     koi_period   koi_duration      koi_depth       koi_prad        koi_teq 
##      45.365541       3.146117       2.405060       2.099924       5.875834 
##      koi_insol  koi_model_snr      koi_steff      koi_slogg       koi_srad 
##       1.658242       1.735790      15.228120       4.938804       2.532314 
##      koi_smass     koi_impact        koi_ror       koi_srho        koi_sma 
##       5.751579       2.101254       2.885427       1.112261      63.188275 
##       koi_incl        koi_dor koi_ldm_coeff1 koi_ldm_coeff2       koi_smet 
##       2.014036       4.972557     173.500591     113.851021       2.123956
# Check for high values
max_vif <- max(vif_values, na.rm = TRUE)
print(paste("Max VIF:", round(max_vif, 2)))
## [1] "Max VIF: 173.5"

From the VIF values, we notice the following: 1. Physical Relationships: koi_period and koi_sma are expected to be highly correlated due to Kepler’s Third Law. 2. Parameter Dependencies: Limb darkening coefficients (koi_ldm_coeff1, koi_ldm_coeff2) are often highly correlated with each other and can also be correlated with stellar temperature (koi_steff). Stellar parameters (koi_steff, koi_teq, koi_smass) are often correlated among themselves.

We proceed by removing highly correlated predictors from the model.

# Remove highly correlated predictors
predictors_to_remove <- c("koi_sma", "koi_ldm_coeff2")
train_data_scaled <- train_data_scaled[, !names(train_data_scaled) %in% predictors_to_remove]
test_data_scaled <- test_data_scaled[, !names(test_data_scaled) %in% predictors_to_remove]

# Update predictor_cols
predictor_cols <- predictor_cols[!predictor_cols %in% predictors_to_remove]

# Refit the model with updated data
glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_original_model)
## 
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.12710    0.08484  13.286  < 2e-16 ***
## koi_period     -0.02010    0.07338  -0.274  0.78415    
## koi_duration    1.26967    0.10099  12.573  < 2e-16 ***
## koi_depth       0.78093    0.34724   2.249  0.02452 *  
## koi_prad        0.01503    0.26629   0.056  0.95498    
## koi_teq         2.12831    0.10022  21.237  < 2e-16 ***
## koi_insol      -1.20622    0.12232  -9.861  < 2e-16 ***
## koi_model_snr  -0.07166    0.08178  -0.876  0.38091    
## koi_steff      -0.17450    0.08311  -2.100  0.03576 *  
## koi_slogg       0.79903    0.08851   9.028  < 2e-16 ***
## koi_srad        0.11529    0.07186   1.604  0.10861    
## koi_smass       0.46510    0.09684   4.803 1.57e-06 ***
## koi_impact      0.41540    0.10568   3.931 8.47e-05 ***
## koi_ror         5.69350    0.50833  11.200  < 2e-16 ***
## koi_srho        0.02612    0.03047   0.857  0.39119    
## koi_incl       -0.24948    0.10070  -2.477  0.01323 *  
## koi_dor         0.62047    0.07931   7.823 5.14e-15 ***
## koi_ldm_coeff1  0.18397    0.06020   3.056  0.00224 ** 
## koi_smet       -0.66341    0.04795 -13.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8866.2  on 6395  degrees of freedom
## Residual deviance: 5054.1  on 6377  degrees of freedom
## AIC: 5092.1
## 
## Number of Fisher Scoring iterations: 8

2.11 Model Evaluation without correlated predictors

Predict on the test set and evaluate.

original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Ensure probabilities are valid
original_probs <- pmax(pmin(original_probs, 1 - 1e-15), 1e-15)

# Convert probabilities to class predictions using a 0.5 threshold
threshold <- 0.5
# Use the levels from the *test* data target factor for creating prediction factor
test_target_levels <- levels(test_data_scaled[[target_variable_name]])
negative_class <- test_target_levels[1] # Should be 'candidate'
# Note: positive_class was defined earlier based on train_data levels

original_preds <- factor(ifelse(original_probs > threshold, positive_class, negative_class),
  levels = test_target_levels
)

test_target_factor <- test_data_scaled[[target_variable_name]]

cm <- confusionMatrix(original_preds,
  test_target_factor,
  positive = positive_class
) # Ensure positive class is correctly specified
cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            704            187
##   false_positive       102            606
##                                           
##                Accuracy : 0.8193          
##                  95% CI : (0.7995, 0.8378)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6382          
##                                           
##  Mcnemar's Test P-Value : 7.765e-07       
##                                           
##             Sensitivity : 0.7642          
##             Specificity : 0.8734          
##          Pos Pred Value : 0.8559          
##          Neg Pred Value : 0.7901          
##              Prevalence : 0.4959          
##          Detection Rate : 0.3790          
##    Detection Prevalence : 0.4428          
##       Balanced Accuracy : 0.8188          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_original <- roc(
  response = test_target_factor,
  predictor = original_probs,
  levels = test_target_levels
) # Specify levels explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))
## [1] "GLM Original Features - AUC: 0.8875"

Plot ROC curve

plot(roc_original,
  main = "ROC Curve - GLM Original Features",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

vif_values <- vif(glm_original_model)
print("VIF Values:")
## [1] "VIF Values:"
print(vif_values)
##     koi_period   koi_duration      koi_depth       koi_prad        koi_teq 
##       5.352631       2.999511       2.404310       2.116069       4.028664 
##      koi_insol  koi_model_snr      koi_steff      koi_slogg       koi_srad 
##       1.570927       1.729637       4.067829       4.808042       2.472033 
##      koi_smass     koi_impact        koi_ror       koi_srho       koi_incl 
##       5.048217       2.095302       2.886955       1.112462       2.006146 
##        koi_dor koi_ldm_coeff1       koi_smet 
##       4.665995       2.771644       1.431394
# Check for high values
max_vif <- max(vif_values, na.rm = TRUE)
print(paste("Max VIF:", round(max_vif, 2)))
## [1] "Max VIF: 5.35"

Now the model should be good.

2.12 Summary of GLM Results

The logistic regression model was fitted using scaled original numeric features (excluding the binary flag variables) to predict the koi_pdisposition.

  1. Model Fit & Significant Predictors: The overall model shows a highly significant improvement over the null model (intercept only), indicated by the large drop in deviance (Null: 8866.2, Residual: 5045.5). Several predictors showed a statistically significant relationship (p < 0.05) with the target variable in this model: koi_duration, koi_depth, koi_teq, koi_insol, koi_steff, koi_slogg, koi_smass, koi_impact, koi_ror, koi_incl, koi_dor, koi_ldm_coeff1, koi_ldm_coeff2, and koi_smet. Notable predictors that were not statistically significant in this specific model include koi_period, koi_prad, koi_model_snr, koi_srad, koi_srho, and koi_sma. This could be due to collinearity with other included variables.

  2. Residual Analysis (Non-linearity Check): Tests for non-linearity (curvature) were performed on the model’s residuals. Significant evidence (p < 0.05) of non-linear patterns remaining in the residuals was found for the following predictors: koi_depth, koi_teq, koi_insol, koi_slogg, koi_srad, koi_smass, koi_impact, koi_sma, koi_incl, koi_dor, and koi_smet. koi_period was borderline significant. This indicates that the assumption of a purely linear relationship between these predictors and the log-odds of the outcome is likely violated.

  3. Conclusion: The GLM identifies many significant predictors for exoplanet disposition. However, the residual analysis strongly suggests that the model’s linear structure is insufficient to capture the underlying relationships for numerous variables. This indicates that model performance could potentially be improved by incorporating non-linear effects or using models better suited to capturing such complexities.

3 Model improvements

3.1 GAM model

Let’s start by fitting a GAM model in order to use smooth functions s() for predictors showing non-linearity in residual analysis.

nonlinear_predictors_gam <- c(
  "koi_depth", "koi_teq", "koi_insol", "koi_slogg",
  "koi_srad", "koi_smass", "koi_impact", "koi_sma",
  "koi_incl", "koi_dor", "koi_smet", "koi_period"
)

nonlinear_predictors_gam <- nonlinear_predictors_gam[nonlinear_predictors_gam %in% names(train_data_scaled)]

linear_predictors_gam <- setdiff(predictor_cols, nonlinear_predictors_gam)

Prepare the formula for the GAM model.

smooth_terms <- if (length(nonlinear_predictors_gam) > 0) paste(paste0("s(", nonlinear_predictors_gam, ")"), collapse = " + ") else NULL
linear_terms <- if (length(linear_predictors_gam) > 0) paste(linear_predictors_gam, collapse = " + ") else NULL

gam_formula_str <- paste(target_variable_name, "~", paste(c(smooth_terms, linear_terms), collapse = " + "))
# Remove potential trailing/leading "+" if one group was empty
gam_formula_str <- gsub("\\+ $", "", gsub("^\\+ ", "", gsub(" \\+ \\+ ", " + ", gam_formula_str)))

gam_formula <- as.formula(gam_formula_str)
print(paste("Using GAM formula:", gam_formula_str))
## [1] "Using GAM formula: koi_pdisposition ~ s(koi_depth) + s(koi_teq) + s(koi_insol) + s(koi_slogg) + s(koi_srad) + s(koi_smass) + s(koi_impact) + s(koi_incl) + s(koi_dor) + s(koi_smet) + s(koi_period) + koi_duration + koi_prad + koi_model_snr + koi_steff + koi_ror + koi_srho + koi_ldm_coeff1"

Fit the GAM model.

gam_model <- gam(gam_formula,
  data = train_data_scaled,
  family = binomial(link = "logit"),
  method = "REML"
)
summary(gam_model)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## koi_pdisposition ~ s(koi_depth) + s(koi_teq) + s(koi_insol) + 
##     s(koi_slogg) + s(koi_srad) + s(koi_smass) + s(koi_impact) + 
##     s(koi_incl) + s(koi_dor) + s(koi_smet) + s(koi_period) + 
##     koi_duration + koi_prad + koi_model_snr + koi_steff + koi_ror + 
##     koi_srho + koi_ldm_coeff1
## 
## Parametric coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.94561    0.11063   8.548  < 2e-16 ***
## koi_duration    1.73728    0.14344  12.112  < 2e-16 ***
## koi_prad       -0.28360    0.29322  -0.967  0.33345    
## koi_model_snr  -0.13978    0.08423  -1.659  0.09702 .  
## koi_steff       0.01356    0.17555   0.077  0.93843    
## koi_ror         6.64085    0.79541   8.349  < 2e-16 ***
## koi_srho       -0.00168    0.03094  -0.054  0.95669    
## koi_ldm_coeff1  0.20498    0.07090   2.891  0.00384 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df  Chi.sq p-value    
## s(koi_depth)  2.721  3.404   6.681  0.0983 .  
## s(koi_teq)    5.173  6.144 309.050  <2e-16 ***
## s(koi_insol)  1.000  1.000   0.004  0.9485    
## s(koi_slogg)  4.254  5.476  53.962  <2e-16 ***
## s(koi_srad)   6.971  7.765   3.574  0.8206    
## s(koi_smass)  1.002  1.004   0.513  0.4749    
## s(koi_impact) 1.817  2.077  14.228  0.0305 *  
## s(koi_incl)   2.289  2.865   3.743  0.3121    
## s(koi_dor)    4.856  6.004  96.825  <2e-16 ***
## s(koi_smet)   5.843  6.826  97.286  <2e-16 ***
## s(koi_period) 2.696  3.438   4.490  0.2376    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.544   Deviance explained = 46.3%
## -REML = 2458.4  Scale est. = 1         n = 6396

Plot of the splines effect:

par(mfrow = c(2, 3))
for (i in 1:11) {
  plot(gam_model, select = i, shade = TRUE, shade.col = "lightblue")
  abline(h = 0, lty = "dashed")
}

As we can see from the plots, not all the splines are significant. For example koi_period, koi_insol, koi_smass, koi_period, koi_incl, koi_depth and koi_impact are not significant. We will try to remove them from the model.

gam_model <- update(gam_model, . ~ . - s(koi_period) - s(koi_insol) - s(koi_smass) - s(koi_period) - s(koi_incl) - s(koi_depth) - s(koi_impact))
summary(gam_model)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## koi_pdisposition ~ s(koi_teq) + s(koi_slogg) + s(koi_srad) + 
##     s(koi_dor) + s(koi_smet) + koi_duration + koi_prad + koi_model_snr + 
##     koi_steff + koi_ror + koi_srho + koi_ldm_coeff1
## 
## Parametric coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.00430    0.06785  14.801  < 2e-16 ***
## koi_duration    1.41230    0.09649  14.637  < 2e-16 ***
## koi_prad       -0.19688    0.26623  -0.739  0.45961    
## koi_model_snr  -0.04401    0.07035  -0.626  0.53163    
## koi_steff       0.23853    0.10635   2.243  0.02491 *  
## koi_ror         7.67484    0.42627  18.004  < 2e-16 ***
## koi_srho        0.02748    0.02981   0.922  0.35658    
## koi_ldm_coeff1  0.19630    0.07053   2.783  0.00538 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df  Chi.sq p-value    
## s(koi_teq)   5.699  6.838 297.439  <2e-16 ***
## s(koi_slogg) 4.830  5.947  80.271  <2e-16 ***
## s(koi_srad)  6.249  7.163   6.793    0.49    
## s(koi_dor)   8.097  8.784 209.363  <2e-16 ***
## s(koi_smet)  5.997  6.962 148.411  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.539   Deviance explained = 45.7%
## -REML = 2483.8  Scale est. = 1         n = 6396

Now we can see that all the splines are significant. Plot of the splines effect:

par(mfrow = c(2, 3))
for (i in 1:6) {
  plot(gam_model, select = i, shade = TRUE, shade.col = "lightblue")
  abline(h = 0, lty = "dashed")
}

Predict on the test set and evaluate.

gam_probs <- predict(gam_model, newdata = test_data_scaled, type = "response")
gam_probs <- pmax(pmin(gam_probs, 1 - 1e-15), 1e-15)
gam_preds <- factor(ifelse(gam_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))

Confusion matrix

cm_gam <- confusionMatrix(gam_preds, test_target_factor, positive = positive_class)
cm_gam
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            695            165
##   false_positive       111            628
##                                          
##                Accuracy : 0.8274         
##                  95% CI : (0.808, 0.8456)
##     No Information Rate : 0.5041         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6546         
##                                          
##  Mcnemar's Test P-Value : 0.001422       
##                                          
##             Sensitivity : 0.7919         
##             Specificity : 0.8623         
##          Pos Pred Value : 0.8498         
##          Neg Pred Value : 0.8081         
##              Prevalence : 0.4959         
##          Detection Rate : 0.3927         
##    Detection Prevalence : 0.4622         
##       Balanced Accuracy : 0.8271         
##                                          
##        'Positive' Class : false_positive 
## 

Plot confusion matrix

cm_data <- as.data.frame(cm_gam$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_gam <- roc(response = test_target_factor, predictor = gam_probs, levels = levels(test_target_factor))
print(paste("GAM - AUC:", round(auc(roc_gam), 4)))
## [1] "GAM - AUC: 0.8937"

Plot ROC curve

plot(roc_gam,
  main = "ROC Curve - GAM",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

3.2 Interaction terms

Let’s try adding interaction terms to the original GLM model.

interaction_subset <- c("koi_teq", "koi_slogg", "koi_ror", "koi_smet", "koi_impact")
# Ensure these exist in the data
interaction_subset <- interaction_subset[interaction_subset %in% names(train_data_scaled)]

# Get all other predictors to include as main effects
all_predictors <- setdiff(names(train_data_scaled), target_variable_name)
other_predictors <- setdiff(all_predictors, interaction_subset)

Prepare the formula for the GLM model with interaction terms.

interaction_formula_str <- paste(
  target_variable_name, "~",
  paste0("(", paste(interaction_subset, collapse = " + "), ")^2"), # Pairwise interactions + main effects
  "+",
  paste(other_predictors, collapse = " + ")
) # Add main effects of others
interaction_formula <- as.formula(interaction_formula_str)
print(paste("Using interaction formula:", interaction_formula_str))
## [1] "Using interaction formula: koi_pdisposition ~ (koi_teq + koi_slogg + koi_ror + koi_smet + koi_impact)^2 + koi_period + koi_duration + koi_depth + koi_prad + koi_insol + koi_model_snr + koi_steff + koi_srad + koi_smass + koi_srho + koi_incl + koi_dor + koi_ldm_coeff1"

Fit the GLM model.

glm_interaction_model <- glm(interaction_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_interaction_model)
## 
## Call:
## glm(formula = interaction_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                        Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)           8.837e+14  9.182e+05  962444900   <2e-16 ***
## koi_teq               1.056e+15  1.991e+06  530666658   <2e-16 ***
## koi_slogg             4.326e+14  1.843e+06  234735460   <2e-16 ***
## koi_ror               1.465e+15  4.019e+06  364412212   <2e-16 ***
## koi_smet             -3.529e+14  1.036e+06 -340529745   <2e-16 ***
## koi_impact            9.351e+13  2.577e+06   36282262   <2e-16 ***
## koi_period            2.376e+14  1.521e+06  156148218   <2e-16 ***
## koi_duration          4.758e+14  1.162e+06  409363536   <2e-16 ***
## koi_depth             1.864e+14  1.361e+06  137001244   <2e-16 ***
## koi_prad              2.407e+14  2.072e+06  116192510   <2e-16 ***
## koi_insol             3.147e+13  2.523e+06   12472902   <2e-16 ***
## koi_model_snr        -4.602e+13  1.076e+06  -42785900   <2e-16 ***
## koi_steff            -1.351e+14  1.779e+06  -75962347   <2e-16 ***
## koi_srad              1.993e+14  1.613e+06  123600517   <2e-16 ***
## koi_smass             2.297e+14  1.987e+06  115628797   <2e-16 ***
## koi_srho              3.575e+13  9.337e+05   38291236   <2e-16 ***
## koi_incl              4.230e+13  1.600e+06   26439186   <2e-16 ***
## koi_dor               1.143e+14  1.539e+06   74218505   <2e-16 ***
## koi_ldm_coeff1        1.398e+14  1.364e+06  102506756   <2e-16 ***
## koi_teq:koi_slogg     1.687e+14  5.822e+05  289846990   <2e-16 ***
## koi_teq:koi_ror      -8.763e+14  2.987e+06 -293351502   <2e-16 ***
## koi_teq:koi_smet      2.522e+13  9.968e+05   25295452   <2e-16 ***
## koi_teq:koi_impact   -3.046e+14  2.613e+06 -116592077   <2e-16 ***
## koi_slogg:koi_ror    -3.951e+14  3.320e+06 -119029491   <2e-16 ***
## koi_slogg:koi_smet    1.152e+14  9.377e+05  122906001   <2e-16 ***
## koi_slogg:koi_impact -4.026e+13  2.231e+06  -18047890   <2e-16 ***
## koi_ror:koi_smet     -4.314e+13  1.748e+06  -24684677   <2e-16 ***
## koi_ror:koi_impact   -7.049e+13  1.159e+05 -608474447   <2e-16 ***
## koi_smet:koi_impact  -1.282e+14  1.778e+06  -72140653   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:   8866.2  on 6395  degrees of freedom
## Residual deviance: 106184.6  on 6367  degrees of freedom
## AIC: 106243
## 
## Number of Fisher Scoring iterations: 25

Predict on the test set and evaluate.

interaction_probs <- predict(glm_interaction_model, newdata = test_data_scaled, type = "response")
interaction_probs <- pmax(pmin(interaction_probs, 1 - 1e-15), 1e-15)
interaction_preds <- factor(ifelse(interaction_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))

Confusion matrix

cm_interaction <- confusionMatrix(interaction_preds, test_target_factor, positive = positive_class)
cm_interaction
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            486             80
##   false_positive       320            713
##                                           
##                Accuracy : 0.7498          
##                  95% CI : (0.7279, 0.7709)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5009          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8991          
##             Specificity : 0.6030          
##          Pos Pred Value : 0.6902          
##          Neg Pred Value : 0.8587          
##              Prevalence : 0.4959          
##          Detection Rate : 0.4459          
##    Detection Prevalence : 0.6460          
##       Balanced Accuracy : 0.7510          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(cm_interaction$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_glm_interaction <- roc(response = test_target_factor, predictor = interaction_probs, levels = levels(test_target_factor))
print(paste("GLM with interaction - AUC:", round(auc(roc_glm_interaction), 4)))
## [1] "GLM with interaction - AUC: 0.751"

Plot ROC curve

plot(roc_glm_interaction,
  main = "ROC Curve - GLM with interaction",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

4 Lasso Regression

Preapare dataset and formula

train_formula_lasso <- as.formula(paste("~ .")) # Include all predictors
x_train <- model.matrix(train_formula_lasso, data = train_data_scaled[, all_predictors])[, -1] # Predictor matrix, remove intercept
y_train <- train_data_scaled[[target_variable_name]] # Target factor

4.1 Fit the model

set.seed(42) # for reproducibility of CV folds
cv_lasso_fit <- cv.glmnet(x_train, y_train,
  family = "binomial",
  alpha = 1,
  type.measure = "auc", # Use AUC for CV evaluation
  nfolds = 10
)

4.2 Plot the results

plot(cv_lasso_fit)

4.3 Evaluate the model

best_lambda <- cv_lasso_fit$lambda.min
x_test <- model.matrix(train_formula_lasso, data = test_data_scaled[, all_predictors])[, -1]

# Evaluate using the best lambda found by CV
lasso_probs <- predict(cv_lasso_fit, newx = x_test, s = best_lambda, type = "response")
lasso_probs <- as.vector(pmax(pmin(lasso_probs, 1 - 1e-15), 1e-15)) # Ensure stability
lasso_preds <- factor(ifelse(lasso_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))

Confusion matrix

lasso_cm <- confusionMatrix(lasso_preds, test_target_factor, positive = positive_class)
lasso_cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            722            222
##   false_positive        84            571
##                                           
##                Accuracy : 0.8086          
##                  95% CI : (0.7885, 0.8276)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6167          
##                                           
##  Mcnemar's Test P-Value : 4.811e-15       
##                                           
##             Sensitivity : 0.7201          
##             Specificity : 0.8958          
##          Pos Pred Value : 0.8718          
##          Neg Pred Value : 0.7648          
##              Prevalence : 0.4959          
##          Detection Rate : 0.3571          
##    Detection Prevalence : 0.4096          
##       Balanced Accuracy : 0.8079          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(lasso_cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_lasso <- roc(response = test_target_factor, predictor = lasso_probs, levels = levels(test_target_factor))
print(paste("Lasso - AUC:", round(auc(roc_lasso), 4)))
## [1] "Lasso - AUC: 0.8816"

Plot ROC curve

plot(roc_lasso,
  main = "ROC Curve - Lasso",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

5 Ridge regression

5.1 Fit the model

set.seed(42) # for reproducibility of CV folds
cv_ridge_fit <- cv.glmnet(x_train, y_train,
  family = "binomial",
  alpha = 0, # Set alpha to 0 for Ridge
  type.measure = "auc", # Use AUC for CV evaluation
  nfolds = 10
)

5.2 Plot the results

plot(cv_ridge_fit)

5.3 Evaluate the model

best_lambda_ridge <- cv_ridge_fit$lambda.min
ridge_probs <- predict(cv_ridge_fit, newx = x_test, s = best_lambda_ridge, type = "response")
ridge_probs <- as.vector(pmax(pmin(ridge_probs, 1 - 1e-15), 1e-15)) # Ensure stability
ridge_preds <- factor(ifelse(ridge_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))

Confusion matrix

ridge_cm <- confusionMatrix(ridge_preds, test_target_factor, positive = positive_class)
ridge_cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            717            214
##   false_positive        89            579
##                                           
##                Accuracy : 0.8105          
##                  95% CI : (0.7904, 0.8294)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6205          
##                                           
##  Mcnemar's Test P-Value : 1.051e-12       
##                                           
##             Sensitivity : 0.7301          
##             Specificity : 0.8896          
##          Pos Pred Value : 0.8668          
##          Neg Pred Value : 0.7701          
##              Prevalence : 0.4959          
##          Detection Rate : 0.3621          
##    Detection Prevalence : 0.4178          
##       Balanced Accuracy : 0.8099          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(lasso_cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC Curve

roc_ridge <- roc(response = test_target_factor, predictor = ridge_probs, levels = levels(test_target_factor))
print(paste("Ridge Regression - AUC:", round(auc(roc_ridge), 4)))
## [1] "Ridge Regression - AUC: 0.8828"

Plot ROC curve

plot(roc_ridge,
  main = "ROC Curve - Lasso",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

6 Save model results for comparison

# Create data frame with model performance metrics
models_performance <- data.frame(
  Model = c("GLM", "GAM", "GLM with Interactions", "Lasso", "Ridge"),
  Accuracy = c(
    cm$overall["Accuracy"],
    cm_gam$overall["Accuracy"],
    cm_interaction$overall["Accuracy"],
    lasso_cm$overall["Accuracy"],
    ridge_cm$overall["Accuracy"]
  ),
  Sensitivity = c(
    cm$byClass["Sensitivity"],
    cm_gam$byClass["Sensitivity"],
    cm_interaction$byClass["Sensitivity"],
    lasso_cm$byClass["Sensitivity"],
    ridge_cm$byClass["Sensitivity"]
  ),
  Specificity = c(
    cm$byClass["Specificity"],
    cm_gam$byClass["Specificity"],
    cm_interaction$byClass["Specificity"],
    lasso_cm$byClass["Specificity"],
    ridge_cm$byClass["Specificity"]
  ),
  AUC = c(
    auc(roc_original),
    auc(roc_gam),
    auc(roc_glm_interaction),
    auc(roc_lasso),
    auc(roc_ridge)
  )
)

# Round numeric columns to 4 decimal places
models_performance[, 2:5] <- round(models_performance[, 2:5], 4)

# save to Rda
save(models_performance, file = "data/Rdas/models_performance.Rda")

# Display the results
models_performance